#!/bin/bash

# Strict mode: stop at the first failing command, treat unset variables as
# errors, and make a pipeline fail when any of its stages fails. Without it a
# failed generator, zcat or join would go unnoticed and the following steps
# would build indexes from (and then delete) incomplete files.
set -euo pipefail

# Root of the data tree, and the directory this stage writes its results to.
export BASE_DATA_PATH=/home/mixagol/data
export CUR_DATA_DIR="${BASE_DATA_PATH}/1_databases"
# Byte-wise collation for this script and every child process, so that the
# sort and join calls further down use the same ordering.
export LC_ALL=C

# Paths used by several of the commands in this step.
raw_db="${BASE_DATA_PATH}/0_src/raw.db"
genom_db="${CUR_DATA_DIR}/genom.db"
genes_info="${CUR_DATA_DIR}/genes_info.txt"

# create_genom_db.py is handed raw.db and genom.db (presumably source, then
# destination); create_genomes.py then prints to stdout, which is stored
# gzip-compressed as genomes.txt.gz.
1_databases/create_genom_db.py "$raw_db" "$genom_db"
1_databases/create_genomes.py "$genom_db" | gzip > "${CUR_DATA_DIR}/genomes.txt.gz"

# Disabled step, kept for reference.
#1_databases/create_gen_desc_db.py  ${BASE_DATA_PATH}/0_src/raw.db ${CUR_DATA_DIR}/gen_desc.db
1_databases/create_gene_info.py "$raw_db" "$genes_info"
# Sort genes_info.txt in place on its first tab-separated column; the join
# calls later in this script take a file derived from it as input.
sort -t$'\t' -k1,1 -o "$genes_info" "$genes_info"
# NOTE(review): genes.txt.gz is read later in this script, but the only command
# that would write it is the disabled one below - presumably the file is left
# over from an earlier run; confirm.
#1_databases/create_genes.py  ${CUR_DATA_DIR}/genom.db  ${CUR_DATA_DIR}/gen_desc.db | gzip > ${CUR_DATA_DIR}/genes.txt.gz

# Keep the first column of every genes_info.txt row whose fourth column is
# not "__NULL__", not exactly "hypothetical protein", and does not begin
# with "unknown". Each rule below drops one of those cases.
awk -F'\t' '
    $4 == "__NULL__"             { next }
    $4 == "hypothetical protein" { next }
    index($4, "unknown") == 1    { next }
    { print $1 }
' "${CUR_DATA_DIR}/genes_info.txt" > "${CUR_DATA_DIR}/genes_good.txt"

# awk program shared by both conversions: column 1 becomes a ">" header line
# and column 2 the line that follows it (FASTA layout).
to_fasta='{ printf ">%s\n%s\n", $1, $2 }'

# All rows of genomes.txt.gz.
zcat "${CUR_DATA_DIR}/genomes.txt.gz" \
    | awk -F '\t' "$to_fasta" \
    > "${CUR_DATA_DIR}/genomes.faa"

# Only the rows of genes.txt.gz whose first column is listed in
# genes_good.txt; join pairs the two inputs on that column.
join -t$'\t' "${CUR_DATA_DIR}/genes_good.txt" <(zcat "${CUR_DATA_DIR}/genes.txt.gz") \
    | awk -F '\t' "$to_fasta" \
    > "${CUR_DATA_DIR}/genes.faa"

# NCBI BLAST+ databases are written below ${CUR_DATA_DIR}/blast_db.
export BLAST_DB_DIR="${CUR_DATA_DIR}/blast_db"
mkdir -p "$BLAST_DB_DIR"

# Run makeblastdb for one FASTA file.
# Arguments: $1 - FASTA file name inside CUR_DATA_DIR
#            $2 - database title, also used as the output base name
build_ncbi_db() {
    ~/ncbi-blast-2.2.25+/bin/makeblastdb \
        -dbtype nucl \
        -in "${CUR_DATA_DIR}/$1" \
        -title "$2" \
        -input_type fasta \
        -hash_index \
        -out "${BLAST_DB_DIR}/$2" \
        -max_file_sz 15GB
}

build_ncbi_db genomes.faa GENOMES_DB
build_ncbi_db genes.faa GENES_DB

# From here on BLAST_DB_DIR points at the WU-BLAST output directory.
export BLAST_DB_DIR="${CUR_DATA_DIR}/wu_blast_db"
mkdir -p "$BLAST_DB_DIR"

# One ab-formatdb run per FASTA file, genomes first and genes second. The
# database name doubles as title (-t) and output base name (-n).
# NOTE(review): "-p F" presumably selects nucleotide input - confirm against
# the ab-formatdb documentation.
for wu_db_name in GENOMES_DB GENES_DB; do
    case "$wu_db_name" in
        GENOMES_DB) wu_fasta=genomes.faa ;;
        GENES_DB)   wu_fasta=genes.faa ;;
    esac
    ~/wu-blast/ab-formatdb \
        -p F \
        -i "${CUR_DATA_DIR}/${wu_fasta}" \
        -t "$wu_db_name" \
        -n "${BLAST_DB_DIR}/${wu_db_name}"
done

# Remove the two intermediate FASTA files now that the database builds above
# have consumed them.
for fasta_stem in genomes genes; do
    rm "${CUR_DATA_DIR}/${fasta_stem}.faa"
done

#######################################
# Convert tab-separated rows read from stdin into FASTA (column 1 becomes the
# ">" header line, column 2 the line after it) and store the result as
# numbered chunk files.
# Arguments: $1 - output directory; created if it does not exist yet
# Outputs:   $1/00000.faa, $1/00001.faa, ... with at most 1000 lines each.
#            Every row yields two lines, so a record never straddles two files.
# Returns:   0 on success, non-zero if mkdir, awk, split or mv fails
#######################################
write_fasta_chunks() {
    local out_dir=$1 chunk

    # -p keeps a re-run from failing on the already existing directory.
    mkdir -p -- "$out_dir" || return

    awk -F '\t' '{print ">"$1"\n"$2}' \
        | split -d -a 5 -l 1000 - "${out_dir}/" || return

    # Append ".faa" using a glob instead of parsing ls output through xargs:
    # no argument-length limit, no word splitting, and files that already
    # carry the suffix (from a previous run) are not renamed to "*.faa.faa".
    for chunk in "${out_dir}"/*; do
        [[ -f "$chunk" && "$chunk" != *.faa ]] || continue
        mv -- "$chunk" "${chunk}.faa" || return
    done
}

# All rows of genes.txt.gz.
zcat "${CUR_DATA_DIR}/genes.txt.gz" \
    | write_fasta_chunks "${CUR_DATA_DIR}/genes_faa"

# Only the rows of genes.txt.gz whose first column is listed in genes_good.txt.
join -t$'\t' \
    "${CUR_DATA_DIR}/genes_good.txt" \
    <(zcat "${CUR_DATA_DIR}/genes.txt.gz") \
    | write_fasta_chunks "${CUR_DATA_DIR}/genes_good_faa"

